// bdlde_utf8util.h                                                   -*-C++-*-
#ifndef INCLUDED_BDLDE_UTF8UTIL
#define INCLUDED_BDLDE_UTF8UTIL

#include <bsls_ident.h>
BSLS_IDENT("$Id: $")

//@PURPOSE: Provide basic utilities for UTF-8 encodings.
//
//@CLASSES:
//  bdlde::Utf8Util: namespace for utilities for UTF-8 encodings
//
//@DESCRIPTION: This component provides, within the `bdlde::Utf8Util` `struct`,
// a suite of static functions supporting UTF-8 encoded strings.  Two
// interfaces are provided for each function, one where the length of the
// string (in *bytes*) is passed as a separate argument, and one where the
// string is passed as a null-terminated C-style string.
//
// A string is deemed to contain valid UTF-8 if it is compliant with RFC 3629,
// meaning that only 1-, 2-, 3-, and 4-byte sequences are allowed.  Values
// above `U+10ffff` are also not allowed.
//
// Seven types of functions are provided:
//
// * `isValid`, which checks for validity, per RFC 3629, of a (candidate)
//   UTF-8 string.  "Overlong values", that is, values encoded in more bytes
//   than necessary, are not tolerated; nor are "surrogate values", which are
//   values in the range `[U+d800 .. U+dfff]`.
// * `advanceIfValid` and `advanceRaw`, which advance some number of Unicode
//   code points, each of which may be encoded in multiple bytes in a UTF-8
//   string.  `advanceRaw` assumes the string is valid UTF-8, while
//   `advanceIfValid` checks the input for validity and stops advancing if a
//   sequence is encountered that is not valid UTF-8.
// * `numCodePointsIfValid` and `numCodePointsRaw`, which return the number of
//   Unicode code points in a UTF-8 string.  Note that `numCodePointsIfValid`
//   both validates a (candidate) UTF-8 string and counts the number of
//   Unicode code points that it contains.
// * `numBytesIfValid`, which returns the number of bytes a specified number
//   of Unicode code points occupy in a UTF-8 string.
// * `numBytesInCodePoint` (and its deprecated equivalent, `getByteSize`),
//   which returns the length of a single UTF-8 encoded character.
// * `codePointValue`, which returns the integral value of a single UTF-8
//   encoded character.
// * `appendUtf8CodePoint` (and its deprecated equivalent,
//   `appendUtf8Character`), which appends a single Unicode code point to a
//   UTF-8 string.
//
// Embedded null bytes are allowed in strings that are accompanied by an
// explicit length argument.  Naturally, null-terminated C-style strings cannot
// contain embedded null code points.
//
// The UTF-8 format is described in the RFC 3629 document at:
// ```
// http://tools.ietf.org/html/rfc3629
// ```
// and in Wikipedia at:
// ```
// http://en.wikipedia.org/wiki/Utf-8
// ```
//
///Empty Input Strings
///-------------------
// The utility functions provided by this component consider the empty string
// to be valid UTF-8.  For those functions that take input as a
// `(pointer, length)` pair, if `0 == pointer` and `0 == length`, then the
// input is interpreted as a valid, empty string.  However, if `0 == pointer`
// and `0 != length`, the behavior is undefined.  All such functions have a
// counterpart that takes a lone pointer to a null-terminated (C-style) string.
// The behavior is always undefined if 0 is supplied for that lone pointer.
//
///Usage
///-----
// This section illustrates intended use of this component.
//
///Example 1: Validating Strings and Counting Unicode Code Points
/// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// In this usage example, we will encode some Unicode code points in UTF-8
// strings and demonstrate those that are valid and those that are not.
//
// First, we build an unquestionably valid UTF-8 string:
// ```
// bsl::string string;
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xff00);
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x856);
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'a');
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1008aa);
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xfff);
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'w');
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1abcd);
// bdlde::Utf8Util::appendUtf8CodePoint(&string, '.');
// bdlde::Utf8Util::appendUtf8CodePoint(&string, '\n');
// ```
// Then, we check its validity and measure its length:
// ```
// assert(true == bdlde::Utf8Util::isValid(string.data(), string.length()));
// assert(true == bdlde::Utf8Util::isValid(string.c_str()));
//
// assert(   9 == bdlde::Utf8Util::numCodePointsRaw(string.data(),
//                                                  string.length()));
// assert(   9 == bdlde::Utf8Util::numCodePointsRaw(string.c_str()));
// ```
// Next, we encode a lone surrogate value, `0xd8ab`, that we encode as the raw
// 3-byte sequence "\xed\xa2\xab" to avoid validation:
// ```
// bsl::string stringWithSurrogate = string + "\xed\xa2\xab";
//
// assert(false == bdlde::Utf8Util::isValid(stringWithSurrogate.data(),
//                                          stringWithSurrogate.length()));
// assert(false == bdlde::Utf8Util::isValid(stringWithSurrogate.c_str()));
// ```
// Then, we cannot use `numCodePointsRaw` to count the code points in
// `stringWithSurrogate`, since the behavior of that method is undefined unless
// the string is valid.  Instead, the `numCodePointsIfValid` method can be used
// on strings whose validity we are uncertain of:
// ```
// const char *invalidPosition = 0;
//
// bsls::Types::IntPtr rc;
// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
//                                            stringWithSurrogate.data(),
//                                            stringWithSurrogate.length());
// assert(rc < 0);
// assert(bdlde::Utf8Util::k_SURROGATE == rc);
// assert(invalidPosition == stringWithSurrogate.data() + string.length());
//
// invalidPosition = 0;  // reset
//
// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
//                                            stringWithSurrogate.c_str());
// assert(rc < 0);
// assert(bdlde::Utf8Util::k_SURROGATE == rc);
// assert(invalidPosition == stringWithSurrogate.data() + string.length());
// ```
// Now, we encode 0, which is allowed.  However, note that we cannot use any
// interfaces that take a null-terminated string for this case:
// ```
// bsl::string stringWithNull = string;
// stringWithNull += '\0';
//
// assert(true == bdlde::Utf8Util::isValid(stringWithNull.data(),
//                                         stringWithNull.length()));
//
// assert(  10 == bdlde::Utf8Util::numCodePointsRaw(stringWithNull.data(),
//                                                  stringWithNull.length()));
// ```
// Finally, we encode `0x3a` (`:`) as an overlong value using 2 bytes, which is
// not valid UTF-8 (since `:` can be "encoded" in 1 byte):
// ```
// bsl::string stringWithOverlong = string;
// stringWithOverlong += static_cast<char>(0xc0);        // start of 2-byte
//                                                       // sequence
// stringWithOverlong += static_cast<char>(0x80 | ':');  // continuation byte
//
// assert(false == bdlde::Utf8Util::isValid(stringWithOverlong.data(),
//                                          stringWithOverlong.length()));
// assert(false == bdlde::Utf8Util::isValid(stringWithOverlong.c_str()));
//
// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
//                                            stringWithOverlong.data(),
//                                            stringWithOverlong.length());
// assert(rc < 0);
// assert(bdlde::Utf8Util::k_OVERLONG_ENCODING == rc);
// assert(invalidPosition == stringWithOverlong.data() + string.length());
//
// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
//                                            stringWithOverlong.c_str());
// assert(rc < 0);
// assert(bdlde::Utf8Util::k_OVERLONG_ENCODING == rc);
// assert(invalidPosition == stringWithOverlong.data() + string.length());
// ```
//
///Example 2: Advancing Over a Given Number of Code Points
///- - - - - - - - - - - - - - - - - - - - - - - - - - - -
// In this example, we will use the various `advance` functions to advance
// through a UTF-8 string.
//
// First, build the string using `appendUtf8CodePoint`, keeping track of how
// many bytes are in each Unicode code point:
// ```
// bsl::string string;
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xff00);        // 3 bytes
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1ff);         // 2 bytes
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'a');           // 1 byte
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1008aa);      // 4 bytes
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1abcd);       // 4 bytes
// string += "\xe3\x8f\xfe";           // 3 bytes (invalid 3-byte sequence,
//                                     // the first 2 bytes are valid but the
//                                     // last continuation byte is invalid)
// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'w');           // 1 byte
// bdlde::Utf8Util::appendUtf8CodePoint(&string, '\n');          // 1 byte
// ```
// Then, declare a few variables we'll need:
// ```
// bsls::Types::IntPtr  rc;
// int                  status;
// const char          *result;
// const char *const start = string.c_str();
// ```
// Next, try advancing 2 code points, then 3, then 4, observing that the value
// returned is the number of Unicode code points advanced.  Note that since
// we're only advancing over valid UTF-8, we can use either `advanceRaw` or
// `advanceIfValid`:
// ```
// rc = bdlde::Utf8Util::advanceRaw(              &result, start, 2);
// assert(2 == rc);
// assert(3 + 2 == result - start);
//
// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 2);
// assert(0 == status);
// assert(2 == rc);
// assert(3 + 2 == result - start);
//
// rc = bdlde::Utf8Util::advanceRaw(             &result, start, 3);
// assert(3 == rc);
// assert(3 + 2 + 1 == result - start);
//
// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 3);
// assert(0 == status);
// assert(3 == rc);
// assert(3 + 2 + 1 == result - start);
//
// rc = bdlde::Utf8Util::advanceRaw(             &result, start, 4);
// assert(4 == rc);
// assert(3 + 2 + 1 + 4 == result - start);
//
// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 4);
// assert(0 == status);
// assert(4 == rc);
// assert(3 + 2 + 1 + 4 == result - start);
// ```
// Then, try advancing by more code points than are present using
// `advanceIfValid`, and wind up stopping when we encounter invalid input.  The
// behavior of `advanceRaw` is undefined if it is used on invalid input, so we
// cannot use it here.  Also note that we will stop at the beginning of the
// invalid Unicode code point, and not at the first incorrect byte, which is
// two bytes later:
// ```
// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, INT_MAX);
// assert(0 != status);
// assert(5 == rc);
// assert(3 + 2 + 1 + 4 + 4                 == result - start);
// assert(static_cast<int>(string.length()) >  result - start);
// ```
// Now, doctor the string to replace the invalid code point with a valid one,
// so the string is entirely correct UTF-8:
// ```
// string[3 + 2 + 1 + 4 + 4 + 2] = static_cast<char>(0x8a);
// ```
// Finally, advance using both functions by more code points than are in the
// string and in both cases wind up at the end of the string.  Note that
// `advanceIfValid` does not return an error (non-zero) value to `status` when
// it encounters the end of the string:
// ```
// rc = bdlde::Utf8Util::advanceRaw(             &result, start, INT_MAX);
// assert(8 == rc);
// assert(3 + 2 + 1 + 4 + 4 + 3 + 1 + 1     == result - start);
// assert(static_cast<int>(string.length()) == result - start);
//
// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, INT_MAX);
// assert(0 == status);
// assert(8 == rc);
// assert(3 + 2 + 1 + 4 + 4 + 3 + 1 + 1     == result - start);
// assert(static_cast<int>(string.length()) == result - start);
// ```
//
///Example 3: Validating UTF-8 Read from a `bsl::streambuf`
/// - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// In this usage example, we will demonstrate reading and validating UTF-8
// from a stream.
//
// We write a function to read valid UTF-8 to a `bsl::string`.  We don't know
// how long the input will be, so we don't know how long to make the string
// before we start.  We will grow the string in small, 32-byte increments.
// ```
// /// Read valid UTF-8 from the specified streambuf `sb` to the specified
// /// `output`.  Return 0 if the input was exhausted without encountering
// /// any invalid UTF-8, and a non-zero value otherwise.  If invalid UTF-8
// /// is encountered, log a message describing the problem after loading
// /// all the valid UTF-8 preceding it into `output`.  Note that after the
// /// call, in no case will `output` contain any invalid UTF-8.
// int utf8StreambufToString(bsl::string    *output,
//                           bsl::streambuf *sb)
// {
//     enum { k_READ_LENGTH = 32 };
//
//     output->clear();
//     while (true) {
//         bsl::size_t len = output->length();
//         output->resize(len + k_READ_LENGTH);
//         int status;
//         IntPtr numBytes = bdlde::Utf8Util::readIfValid(&status,
//                                                        &(*output)[len],
//                                                        k_READ_LENGTH,
//                                                        sb);
//         BSLS_ASSERT(0 <= numBytes);
//         BSLS_ASSERT(numBytes <= k_READ_LENGTH);
//
//         output->resize(len + numBytes);
//         if (0 < status) {
//             // Buffer was full before the end of input was encountered.
//             // Note that `numBytes` may be up to 3 bytes less than
//             // `k_READ_LENGTH`.
//
//             BSLS_ASSERT(k_READ_LENGTH - 4 < numBytes);
//
//             // Go on to grow the string and get more input.
//
//             continue;
//         }
//         else if (0 == status) {
//             // Success!  We've reached the end of input without
//             // encountering any invalid UTF-8.
//
//             return 0;                                              // RETURN
//         }
//         else {
//             // Invalid UTF-8 encountered; the value of `status` indicates
//             // the exact nature of the problem.  `numBytes` returned from
//             // the above call indicated the number of valid UTF-8 bytes
//             // read before encountering the invalid UTF-8.
//
//             BSLS_LOG_ERROR("Invalid UTF-8 error %s at position %u.\n",
//                            bdlde::Utf8Util::toAscii(status),
//                            static_cast<unsigned>(output->length()));
//
//             return -1;                                             // RETURN
//         }
//     }
// }
// ```

#include <bdlscm_version.h>

#include <bsls_assert.h>
#include <bsls_libraryfeatures.h>
#include <bsls_review.h>
#include <bsls_types.h>

#include <bsl_cstddef.h>
#include <bsl_cstdlib.h>
#include <bsl_iosfwd.h>
#include <bsl_streambuf.h>
#include <bsl_string.h>

#include <string>

namespace BloombergLP {

namespace bdlde {
                              // ===============
                              // struct Utf8Util
                              // ===============

/// This struct provides a namespace for static methods used for validating
/// UTF-8 strings, for counting the number of Unicode code points in them,
/// for advancing pointers through UTF-8 strings by a specified number of
/// Unicode code points, for counting the number of bytes a UTF-8 leading
/// substring occupies, for counting the number of bytes in a UTF-8
/// character, and for appending a Unicode character to a UTF-8 string.
struct Utf8Util {

    // PUBLIC TYPES
    // Local shorthands for the like-named types in `bsls::Types`.
    typedef bsls::Types::size_type size_type;
    typedef bsls::Types::IntPtr    IntPtr;
    typedef bsls::Types::UintPtr   UintPtr;
    typedef bsls::Types::Uint64    Uint64;

    /// Default code point, `U+fffd`, to be substituted for errors.
    enum { k_ERROR_CODE_POINT = 0xfffd };

    /// Enumerate the error status values that are returned (possibly
    /// through an out parameter) from some methods in this utility.  Note
    /// that some of the functions in this `struct` have a return value
    /// that is non-negative on success, and one of these values when an
    /// error occurs, so all of these values must be negative to distinguish
    /// them from a "success" value.
    enum ErrorStatus {

        k_END_OF_INPUT_TRUNCATION       = -1,
           // The end of input was reached partway through a multibyte UTF-8
           // sequence.

        k_UNEXPECTED_CONTINUATION_OCTET = -2,
           // A continuation byte was encountered when not within a multibyte
           // sequence.

        k_NON_CONTINUATION_OCTET        = -3,
           // A non-continuation byte was encountered where a continuation byte
           // was expected.

        k_OVERLONG_ENCODING             = -4,
           // The encoded Unicode value could have been encoded in a sequence
           // of fewer bytes.

        k_INVALID_INITIAL_OCTET         = -5,
           // A sequence began with an octet with its 5 highest-order bits all
           // set (i.e., an octet in the range `[0xf8 .. 0xff]`), which is
           // always invalid in UTF-8.

        k_VALUE_LARGER_THAN_0X10FFFF    = -6,
           // A value larger than 0x10FFFF, the largest value allowed in valid
           // UTF-8, was encoded.

        k_SURROGATE                     = -7
           // Illegal occurrence of a Unicode code point reserved for
           // surrogate values in UTF-16 (i.e., a value in the range
           // `[U+d800 .. U+dfff]`).  Note that all surrogate values are
           // illegal as Unicode code points.
    };

    // CLASS METHODS

    /// Advance past 0 or more consecutive *valid* Unicode code points at
    /// the beginning of the specified `string`, until either the specified
    /// `numCodePoints` code points have been traversed, or the terminating
    /// null byte or invalid UTF-8 is encountered (whichever occurs first),
    /// and return the number of Unicode code points traversed.  Set the
    /// specified `*status` to 0 if no invalid UTF-8 is encountered, and to
    /// a value from the `ErrorStatus` `enum` otherwise.  Set the specified
    /// `*result` to the address of the byte immediately following the last
    /// valid code point traversed, or to `string` if `string` is empty or
    /// `numCodePoints` is 0.  `string` is necessarily null-terminated, so
    /// it cannot contain embedded null bytes.  The behavior is undefined
    /// unless `0 <= numCodePoints`.  Note that the value returned will be
    /// in the range `[0 .. numCodePoints]`.  Also note that `string` may
    /// contain fewer than `bsl::strlen(string)` Unicode code points.
    static IntPtr advanceIfValid(int         *status,
                                 const char **result,
                                 const char  *string,
                                 IntPtr       numCodePoints);

    /// Advance past 0 or more consecutive *valid* Unicode code points at
    /// the beginning of the specified `string` having the specified
    /// `length` (in bytes), until either the specified `numCodePoints` code
    /// points or `length` bytes have been traversed, or invalid UTF-8 is
    /// encountered (whichever occurs first), and return the number of
    /// Unicode code points traversed.  Set the specified `*status` to 0 if
    /// no invalid UTF-8 is encountered, and to a value from the
    /// `ErrorStatus` `enum` otherwise.  Set the specified `*result` to the
    /// address of the byte immediately following the last valid code point
    /// traversed, or to `string` if `length` or `numCodePoints` is 0.
    /// `string` need not be null-terminated and can contain embedded null
    /// bytes, and `string` may be null if `0 == length` (see {Empty Input
    /// Strings}).  The behavior is undefined unless `0 <= numCodePoints`.
    /// Note that the value returned will be in the range
    /// `[0 .. numCodePoints]`.  Also note that `string` may contain fewer
    /// than `length` Unicode code points.
    static IntPtr advanceIfValid(int         *status,
                                 const char **result,
                                 const char  *string,
                                 size_type    length,
                                 IntPtr       numCodePoints);

    /// Advance past 0 or more consecutive *valid* Unicode code points at
    /// the beginning of the specified `string`, until either the specified
    /// `numCodePoints` code points or the whole `string` have been
    /// traversed, or invalid UTF-8 is encountered (whichever occurs first),
    /// and return the number of Unicode code points traversed.  Set the
    /// specified `*status` to 0 if no invalid UTF-8 is encountered, and to
    /// a value from the `ErrorStatus` `enum` otherwise.  Set the specified
    /// `*result` to the address of the byte immediately following the last
    /// valid code point traversed, or to the beginning of `string` if its
    /// length or `numCodePoints` is 0.  `string` need not be
    /// null-terminated and can contain embedded null bytes.  The behavior
    /// is undefined unless `0 <= numCodePoints`.  Note that the value
    /// returned will be in the range `[0 .. numCodePoints]`.  Also note
    /// that `string` may contain fewer than `string.length()` Unicode code
    /// points.
    static IntPtr advanceIfValid(int                      *status,
                                 const char              **result,
                                 const bsl::string_view&   string,
                                 IntPtr                    numCodePoints);

    /// Advance past 0 or more consecutive Unicode code points at the
    /// beginning of the specified `string`, until either the specified
    /// `numCodePoints` code points have been traversed or the terminating
    /// null byte is encountered (whichever occurs first), and return the
    /// number of Unicode code points traversed.  Set the specified
    /// `*result` to the address of the byte immediately following the last
    /// code point traversed, or to `string` if `string` is empty or
    /// `numCodePoints` is 0.  `string` is necessarily null-terminated, so
    /// it cannot contain embedded null bytes.  The behavior is undefined
    /// unless `string` contains valid UTF-8 and `0 <= numCodePoints`.  Note
    /// that the value returned will be in the range
    /// `[0 .. numCodePoints]`.  Also note that `string` may contain fewer
    /// than `bsl::strlen(string)` Unicode code points.
    static IntPtr advanceRaw(const char **result,
                             const char  *string,
                             IntPtr       numCodePoints);

    /// Advance past 0 or more consecutive Unicode code points at the
    /// beginning of the specified `string` having the specified `length`
    /// (in bytes), until either the specified `numCodePoints` code points
    /// or `length` bytes have been traversed (whichever occurs first), and
    /// return the number of Unicode code points traversed.  Set the
    /// specified `*result` to the address of the byte immediately following
    /// the last code point traversed, or to `string` if `length` or
    /// `numCodePoints` is 0.  `string` need not be null-terminated and can
    /// contain embedded null bytes, and `string` may be null if
    /// `0 == length` (see {Empty Input Strings}).  The behavior is
    /// undefined unless the initial `length` bytes of `string` contain
    /// valid UTF-8 and `0 <= numCodePoints`.  Note that the value returned
    /// will be in the range `[0 .. numCodePoints]`.  Also note that
    /// `string` may contain fewer than `length` Unicode code points.
    static IntPtr advanceRaw(const char **result,
                             const char  *string,
                             size_type    length,
                             IntPtr       numCodePoints);

    /// Advance past 0 or more consecutive Unicode code points at the
    /// beginning of the specified `string`, until either the specified
    /// `numCodePoints` code points or the whole `string` have been
    /// traversed (whichever occurs first), and return the number of Unicode
    /// code points traversed.  Set the specified `*result` to the address
    /// of the byte immediately following the last code point traversed, or
    /// to the beginning of `string` if its length or `numCodePoints` is 0.
    /// `string` need not be null-terminated and can contain embedded null
    /// bytes.  The behavior is undefined unless `string` contains only
    /// valid UTF-8 and `0 <= numCodePoints`.  Note that the value returned
    /// will be in the range `[0 .. numCodePoints]`.  Also note that
    /// `string` may contain fewer than `string.length()` Unicode code
    /// points.
    static IntPtr advanceRaw(const char              **result,
                             const bsl::string_view&   string,
                             IntPtr                    numCodePoints);

    /// Append the UTF-8 encoding of the specified Unicode `codePoint` to
    /// the specified `output` string.  Return 0 on success, and a non-zero
    /// value otherwise.
    ///
    /// @DEPRECATED: Use `appendUtf8CodePoint`, which is documented with the
    /// same contract, instead.
    static int appendUtf8Character(bsl::string  *output,
                                   unsigned int  codePoint);

    /// Append the UTF-8 encoding of the specified Unicode `codePoint` to
    /// the specified `output` string.  Return 0 on success, and a non-zero
    /// value otherwise.  Note that overloads are provided for `output` of
    /// type `bsl::string`, `std::string`, and -- only where
    /// `BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING` is defined --
    /// `std::pmr::string`.
    static int appendUtf8CodePoint(bsl::string  *output,
                                   unsigned int  codePoint);
    static int appendUtf8CodePoint(std::string  *output,
                                   unsigned int  codePoint);
#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
    static int appendUtf8CodePoint(std::pmr::string  *output,
                                   unsigned int       codePoint);
#endif

    /// Return the numeric value of the UTF-8-encoded code point beginning
    /// at the specified `codePoint`.  The behavior is undefined unless
    /// `codePoint` is the address of the first byte of a valid UTF-8
    /// encoded character.
    static int codePointValue(const char *codePoint);

    /// Return the length (in bytes) of the UTF-8-encoded code point
    /// beginning at the specified `codePoint`.  The behavior is undefined
    /// unless `codePoint` is the address of the first byte of a valid UTF-8
    /// encoded character.  Note that the value returned will be in the
    /// range `[1 .. 4]`.  Also note that 1 is returned if `0 == *codePoint`
    /// since '\0' is a valid 1-byte encoding.
    ///
    /// @DEPRECATED: Use `numBytesInCodePoint` instead.
    static int getByteSize(const char *codePoint);

    /// Return the length (in bytes) of the UTF-8-encoded code point
    /// beginning at the specified `codePoint`.  The behavior is undefined
    /// unless `codePoint` is the address of the first byte of a valid UTF-8
    /// encoded character.  Note that the value returned will be in the
    /// range `[1 .. 4]`.  Also note that 1 is returned if `0 == *codePoint`
    /// since '\0' is a valid 1-byte encoding.
    static int numBytesInCodePoint(const char *codePoint);

    /// For the specified `byteOffset` in the specified `input`, load the
    /// offset's line number into the specified `lineNumber`, the column
    /// number into the specified `utf8Column`, and the byte offset for the
    /// start of the line into the specified `startOfLineByteOffset`.
    /// Optionally specify `lineDelimeter` used to determine the line
    /// separator.  If `lineDelimeter` is not supplied, lines are delimited
    /// using '\n'.  Return 0 on success, or a non-zero value if `byteOffset`
    /// cannot be found in `input` or if `input` contains non-UTF-8
    /// characters.  The `utf8Column` is the number of UTF-8 code points
    /// between `startOfLineByteOffset` and `byteOffset`.
    static int getLineAndColumnNumber(Uint64         *lineNumber,
                                      Uint64         *utf8Column,
                                      Uint64         *startOfLineByteOffset,
                                      bsl::streambuf *input,
                                      Uint64          byteOffset);
    static int getLineAndColumnNumber(Uint64         *lineNumber,
                                      Uint64         *utf8Column,
                                      Uint64         *startOfLineByteOffset,
                                      bsl::streambuf *input,
                                      Uint64          byteOffset,
                                      char            lineDelimeter);

    /// Return `true` if the specified `string` contains valid UTF-8, and
    /// `false` otherwise.  `string` is necessarily null-terminated, so it
    /// cannot contain embedded null bytes.
    static bool isValid(const char *string);

    /// Return `true` if the specified `string` having the specified
    /// `length` (in bytes) contains valid UTF-8, and `false` otherwise.
    /// `string` need not be null-terminated and can contain embedded null
    /// bytes, and `string` may be null if `0 == length` (see {Empty Input
    /// Strings}).
    static bool isValid(const char *string, size_type length);

    /// Return `true` if the specified `string` contains valid UTF-8, and
    /// `false` otherwise.  `string` need not be null-terminated and can
    /// contain embedded null bytes.
    static bool isValid(const bsl::string_view& string);

    /// Return `true` if the specified `string` contains valid UTF-8, and
    /// `false` otherwise.  If `string` contains invalid UTF-8, load into
    /// the specified `invalidString` the address of the beginning of the
    /// first invalid UTF-8 sequence encountered; `invalidString` is
    /// unaffected if `string` contains only valid UTF-8.  `string` is
    /// necessarily null-terminated, so it cannot contain embedded null
    /// bytes.
    static bool isValid(const char **invalidString, const char *string);

    /// Return `true` if the specified `string` having the specified
    /// `length` (in bytes) contains valid UTF-8, and `false` otherwise.  If
    /// `string` contains invalid UTF-8, load into the specified
    /// `invalidString` the address of the byte after the last valid code
    /// point traversed; `invalidString` is unaffected if `string` contains
    /// only valid UTF-8.  `string` need not be null-terminated and can
    /// contain embedded null bytes, and `string` may be null if
    /// `0 == length` (see {Empty Input Strings}).
    static bool isValid(const char **invalidString,
                        const char  *string,
                        size_type    length);

    /// Return `true` if the specified `string` contains only valid UTF-8
    /// characters, and `false` otherwise.  If `string` contains invalid
    /// UTF-8, load into the specified `invalidString` the address of the
    /// byte after the last valid code point traversed; `invalidString` is
    /// unaffected if `string` contains only valid UTF-8.  `string` need not
    /// be null-terminated and can contain embedded null bytes.
    static bool isValid(const char              **invalidString,
                        const bsl::string_view&   string);

    /// If the specified `codePoint` (having at least the specified
    /// `numBytes`) refers to a valid UTF-8 code point then return `true`
    /// and load the specified `status` with the number of bytes in the
    /// code-point; otherwise, if `codePoint` is not a valid code-point,
    /// return `false` and load `status` with one of the (negative)
    /// `ErrorStatus` constants.  The behavior is undefined unless
    /// `numBytes > 0`.
    static bool isValidCodePoint(int        *status,
                                 const char *codePoint,
                                 size_type   numBytes);

    /// Return the length (in bytes) of the specified `numCodePoints` UTF-8
    /// encodings in the specified `string`, or a value less than 0 if
    /// `string` contains less than `numCodePoints` encodings.  The behavior
    /// is undefined unless `string` refers to valid UTF-8.  Note that
    /// `string` may contain more than `numCodePoints` encodings in which
    /// case the trailing ones are ignored.
    ///
    /// @DEPRECATED: Use `numBytesRaw` instead.
    static IntPtr numBytesIfValid(const bsl::string_view& string,
                                  IntPtr                  numCodePoints);

    /// Return the length (in bytes) of the specified `numCodePoints` UTF-8
    /// encodings in the specified `string`, or a value less than 0 if
    /// `string` contains less than `numCodePoints` encodings.  The behavior
    /// is undefined unless `string` refers to valid UTF-8.  Note that
    /// `string` may contain more than `numCodePoints` encodings in which
    /// case the trailing ones are ignored.
    static IntPtr numBytesRaw(const bsl::string_view& string,
                              IntPtr                  numCodePoints);

    /// Return the number of Unicode code points in the specified `string`.
    /// `string` is necessarily null-terminated, so it cannot contain
    /// embedded null bytes.  The behavior is undefined unless `string`
    /// contains valid UTF-8.  Note that `string` may contain less than
    /// `bsl::strlen(string)` Unicode code points.
    ///
    /// @DEPRECATED: Use `numCodePointsRaw` instead.
    static IntPtr numCharacters(const char *string);

    /// Return the number of Unicode code points in the specified `string`
    /// having the specified `length` (in bytes).  `string` need not be
    /// null-terminated and can contain embedded null bytes, and `string`
    /// may be null if `0 == length` (see {Empty Input Strings}).  The
    /// behavior is undefined unless `string` contains valid UTF-8.  Note
    /// that `string` may contain less than `length` Unicode code points.
    ///
    /// @DEPRECATED: Use `numCodePointsRaw` instead.
    static IntPtr numCharacters(const char *string, size_type length);

    /// Return the number of Unicode code points in the specified `string`
    /// if it contains valid UTF-8, with no effect on the specified
    /// `invalidString`.  Otherwise, return a negative value and load into
    /// `invalidString` the address of the byte after the last valid Unicode
    /// code point traversed.  `string` is necessarily null-terminated, so
    /// it cannot contain embedded null bytes.  Note that `string` may
    /// contain less than `bsl::strlen(string)` Unicode code points.
    ///
    /// @DEPRECATED: Use `numCodePointsIfValid` instead.
    static IntPtr numCharactersIfValid(const char **invalidString,
                                       const char  *string);

    /// Return the number of Unicode code points in the specified `string`
    /// having the specified `length` (in bytes) if `string` contains valid
    /// UTF-8, with no effect on the specified `invalidString`.  Otherwise,
    /// return a negative value and load into `invalidString` the address of
    /// the byte after the last valid Unicode code point traversed.
    /// `string` need not be null-terminated and may contain embedded null
    /// bytes, and `string` may be null if `0 == length` (see {Empty Input
    /// Strings}).  Note that `string` may contain less than `length`
    /// Unicode code points.
    ///
    /// @DEPRECATED: Use `numCodePointsIfValid` instead.
    static IntPtr numCharactersIfValid(const char **invalidString,
                                       const char  *string,
                                       size_type    length);

    /// Return the number of Unicode code points in the specified `string`.
    /// `string` is necessarily null-terminated, so it cannot contain
    /// embedded null bytes.  The behavior is undefined unless `string`
    /// contains valid UTF-8.  Note that `string` may contain less than
    /// `bsl::strlen(string)` Unicode code points.
    ///
    /// @DEPRECATED: Use `numCodePointsRaw` instead.
    static IntPtr numCharactersRaw(const char *string);

    /// Return the number of Unicode code points in the specified `string`
    /// having the specified `length` (in bytes).  `string` need not be
    /// null-terminated and can contain embedded null bytes, and `string`
    /// may be null if `0 == length` (see {Empty Input Strings}).  The
    /// behavior is undefined unless `string` contains valid UTF-8.  Note
    /// that `string` may contain less than `length` Unicode code points.
    ///
    /// @DEPRECATED: Use `numCodePointsRaw` instead.
    static IntPtr numCharactersRaw(const char *string, size_type length);

    /// Return the number of Unicode code points in the specified `string`
    /// if it contains valid UTF-8, with no effect on the specified
    /// `invalidString`.  Otherwise, return a value from the `ErrorStatus`
    /// `enum` (which are all negative) and load into `invalidString` the
    /// address of the byte after the last valid Unicode code point
    /// traversed.  `string` is necessarily null-terminated, so it cannot
    /// contain embedded null bytes.  Note that `string` may contain less
    /// than `bsl::strlen(string)` Unicode code points.
    static IntPtr numCodePointsIfValid(const char **invalidString,
                                       const char  *string);

    /// Return the number of Unicode code points in the specified `string`
    /// having the specified `length` (in bytes) if `string` contains valid
    /// UTF-8, with no effect on the specified `invalidString`.  Otherwise,
    /// return a value from the `ErrorStatus` `enum` (which are all
    /// negative) and load into `invalidString` the address of the byte
    /// after the last valid Unicode code point traversed.  `string` need
    /// not be null-terminated and may contain embedded null bytes, and
    /// `string` may be null if `0 == length` (see {Empty Input Strings}).
    /// Note that `string` may contain less than `length` Unicode code
    /// points.
    static IntPtr numCodePointsIfValid(const char **invalidString,
                                       const char  *string,
                                       size_type    length);

    /// Return the number of Unicode code points in the specified `string`
    /// if `string` contains valid UTF-8, with no effect on the specified
    /// `invalidString`.  Otherwise, return a value from the `ErrorStatus`
    /// `enum` (which are all negative) and load into `invalidString` the
    /// address of the byte after the last valid Unicode code point
    /// traversed.  `string` need not be null-terminated and may contain
    /// embedded null bytes.
    static IntPtr numCodePointsIfValid(const char              **invalidString,
                                       const bsl::string_view&   string);

    /// Return the number of Unicode code points in the specified `string`.
    /// `string` is necessarily null-terminated, so it cannot contain
    /// embedded null bytes.  The behavior is undefined unless `string`
    /// contains valid UTF-8.  Note that `string` may contain less than
    /// `bsl::strlen(string)` Unicode code points.
    static IntPtr numCodePointsRaw(const char *string);

    /// Return the number of Unicode code points in the specified `string`
    /// having the specified `length` (in bytes).  `string` need not be
    /// null-terminated and can contain embedded null bytes, and `string`
    /// may be null if `0 == length` (see {Empty Input Strings}).  The
    /// behavior is undefined unless `string` contains valid UTF-8.  Note
    /// that `string` may contain less than `length` Unicode code points.
    static IntPtr numCodePointsRaw(const char *string, size_type length);

    /// Return the number of Unicode code points in the specified `string`.
    /// `string` need not be null-terminated and can contain embedded null
    /// bytes.  The behavior is undefined unless `string` contains valid
    /// UTF-8.
    static IntPtr numCodePointsRaw(const bsl::string_view& string);

    /// Read from the specified `input` and copy *valid* UTF-8 (only) to the
    /// specified `outputBuffer` having the specified `outputBufferLength`
    /// (in bytes).  Load the specified `status` with:
    /// * 0 if `input` reached `eof` without encountering any invalid UTF-8
    ///   or prematurely exhausting `outputBuffer`.
    /// * A positive value if `input` was not completely read due to
    ///   `outputBuffer` being filled (or nearly filled) without
    ///   encountering any invalid UTF-8.
    /// * A negative value from `ErrorStatus` if invalid UTF-8 was
    ///   encountered (without having written the invalid sequence to
    ///   `outputBuffer`).
    /// Return the number of bytes of valid UTF-8 written to `outputBuffer`.
    /// If no invalid UTF-8 is encountered, or if `input` supports
    /// `sputbackc` with a putback buffer capacity of at least 4 bytes,
    /// `input` will be left positioned at the end of the valid UTF-8 read,
    /// otherwise, `input` will be left in an unspecified state.  The
    /// behavior is undefined unless `4 <= outputBufferLength`.  Note that
    /// this function will stop reading `input` when less than 4 bytes of
    /// space remain in `outputBuffer` to prevent the possibility of a
    /// 4-byte UTF-8 sequence being truncated partway through.
    static size_type readIfValid(int            *status,
                                 char           *outputBuffer,
                                 size_type       outputBufferLength,
                                 bsl::streambuf *input);

    /// Translate the specified `input`, which is UTF-8, replacing error
    /// sequences in `input` with `utf32ErrorCode` translated to UTF-8, and
    /// return the number of errors found, putting the result in `*output`.
    /// Any previous contents of `*output` are discarded.  If `utf32ErrorCode`
    /// is 0, error sequences will be simply deleted.  The behavior is
    /// undefined unless `utf32ErrorCode` is a valid unicode value for UTF-8.
    static size_type replaceErrors(
                 bsl::string             *output,
                 const bsl::string_view&  input,
                 unsigned int             utf32ErrorCode = k_ERROR_CODE_POINT);
    static size_type replaceErrors(
                 std::string             *output,
                 const bsl::string_view&  input,
                 unsigned int             utf32ErrorCode = k_ERROR_CODE_POINT);
#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
    static size_type replaceErrors(
                 std::pmr::string        *output,
                 const bsl::string_view&  input,
                 unsigned int             utf32ErrorCode = k_ERROR_CODE_POINT);
#endif

    /// Return the non-modifiable string representation of the `ErrorStatus`
    /// enumerator matching the specified `value`, if it exists, and "(*
    /// unrecognized value *)" otherwise.  The string representation of an
    /// enumerator that matches `value` is the enumerator name with the "k_"
    /// prefix elided.  Note that this method may be used to aid in
    /// interpreting status values that are returned from some methods in
    /// this utility.  See `ErrorStatus`.
    static const char *toAscii(IntPtr value);
};

                          // =======================
                          // struct Utf8Util_ImpUtil
                          // =======================

/// [**PRIVATE**] This struct provides a namespace for static methods used to
/// implement `Utf8Util`.  Note that the functions are not typically useful
/// for clients, and are primarily exposed to allow for more thorough
/// testing.
struct Utf8Util_ImpUtil {
    // TYPES

    typedef bsls::Types::Uint64    Uint64;
    typedef bsls::Types::size_type size_type;

    // CLASS METHODS

    /// Given a string referred to by the specified `input`, return the number
    /// of bytes in the first code point in the string, whether the code point
    /// is valid or invalid.  Note that if the string begins with an invalid
    /// header octet `(0xf8 == (*input & 0xf8))`, only that octet is skipped;
    /// any following continuation octets are seen as separate errors.  The
    /// behavior is undefined if `input.empty()`.
    static size_type advancePastValidOrInvalidCodePoint(
                                                const bsl::string_view& input);

    /// For the specified `byteOffset` in the specified `input`, load the
    /// byte offset's line number into the specified `lineNumber`, the
    /// column number into the specified `utf8Column`, and the byte offset
    /// for the start of the line into the specified
    /// `startOfLineByteOffset`, using the specified `lineDelimeter` as the
    /// line separator, and using the specified `temporaryReadBuffer` (of
    /// the specified length `temporaryReadBufferNumBytes`) as a temporary
    /// buffer for reading.  Return 0 on success, or a non-zero value if
    /// `byteOffset` cannot be found in `input` or if `input` contains
    /// non-UTF-8 characters.  The `utf8Column` is the number of UTF-8 code
    /// points between `startOfLineByteOffset` and `byteOffset`.  The
    /// behavior is undefined unless `temporaryReadBuffer` refers to a valid
    /// buffer of at least `temporaryReadBufferNumBytes` bytes, and
    /// `temporaryReadBufferNumBytes` is greater than or equal to 4.
    static int getLineAndColumnNumber(
                                  Uint64         *lineNumber,
                                  Uint64         *utf8Column,
                                  Uint64         *startOfLineByteOffset,
                                  bsl::streambuf *input,
                                  Uint64          byteOffset,
                                  char            lineDelimeter,
                                  char           *temporaryReadBuffer,
                                  int             temporaryReadBufferNumBytes);
};

// ============================================================================
//                            INLINE DEFINITIONS
// ============================================================================

                              // ---------------
                              // struct Utf8Util
                              // ---------------

// CLASS METHODS
inline
Utf8Util::IntPtr Utf8Util::advanceIfValid(
                                       int                      *status,
                                       const char              **result,
                                       const bsl::string_view&   string,
                                       IntPtr                    numCodePoints)
{
    // Unpack the view and defer to the (pointer, length) overload.
    const char *const begin    = string.data();
    const size_type   numBytes = string.length();

    return advanceIfValid(status, result, begin, numBytes, numCodePoints);
}

inline
Utf8Util::IntPtr Utf8Util::advanceRaw(const char              **result,
                                      const bsl::string_view&   string,
                                      IntPtr                    numCodePoints)
{
    // Forward the bytes referred to by the view to the (pointer, length)
    // overload.
    const IntPtr numTraversed = advanceRaw(result,
                                           string.data(),
                                           string.length(),
                                           numCodePoints);
    return numTraversed;
}

inline
int Utf8Util::appendUtf8Character(bsl::string  *output,
                                  unsigned int  codePoint)
{
    // Deprecated spelling: `appendUtf8CodePoint` does the actual work.
    const int rc = appendUtf8CodePoint(output, codePoint);
    return rc;
}

inline
int Utf8Util::getByteSize(const char *codePoint)
{
    // Deprecated spelling: `numBytesInCodePoint` does the actual work.
    const int numBytes = numBytesInCodePoint(codePoint);
    return numBytes;
}

inline
int Utf8Util::getLineAndColumnNumber(Uint64         *lineNumber,
                                     Uint64         *utf8Column,
                                     Uint64         *startOfLineByteOffset,
                                     bsl::streambuf *input,
                                     Uint64          byteOffset)
{
    // No delimiter was supplied, so lines are separated by '\n'.
    const char defaultDelimiter = '\n';

    return getLineAndColumnNumber(lineNumber,
                                  utf8Column,
                                  startOfLineByteOffset,
                                  input,
                                  byteOffset,
                                  defaultDelimiter);
}

inline
int Utf8Util::getLineAndColumnNumber(Uint64         *lineNumber,
                                     Uint64         *utf8Column,
                                     Uint64         *startOfLineByteOffset,
                                     bsl::streambuf *input,
                                     Uint64          byteOffset,
                                     char            lineDelimeter)
{
    // Supply a fixed-size, stack-resident scratch buffer to the
    // implementation utility, which performs the scan of `input`.
    const int k_SCRATCH_NUM_BYTES = 2048;
    char      scratch[k_SCRATCH_NUM_BYTES];

    const int rc = Utf8Util_ImpUtil::getLineAndColumnNumber(
                                                         lineNumber,
                                                         utf8Column,
                                                         startOfLineByteOffset,
                                                         input,
                                                         byteOffset,
                                                         lineDelimeter,
                                                         scratch,
                                                         k_SCRATCH_NUM_BYTES);
    return rc;
}

inline
bool Utf8Util::isValid(const char *string)
{
    BSLS_ASSERT(string);

    // The address of the first invalid sequence is of no interest here; give
    // the reporting overload a throw-away location to write it to.
    const char *ignoredInvalidString = 0;
    const bool  valid                = isValid(&ignoredInvalidString, string);

    return valid;
}

inline
bool Utf8Util::isValid(const char *string, size_type length)
{
    BSLS_ASSERT(string || 0 == length);

    // The reporting overload needs somewhere to store the invalid position;
    // the value stored is discarded.
    const char *ignoredInvalidString = 0;
    const bool  valid = isValid(&ignoredInvalidString, string, length);

    return valid;
}

inline
bool Utf8Util::isValid(const bsl::string_view& string)
{
    // Delegate to the reporting overload, discarding the invalid position
    // that it may load.
    const char *ignoredInvalidString = 0;
    const bool  valid                = isValid(&ignoredInvalidString, string);

    return valid;
}

inline
Utf8Util::IntPtr Utf8Util::numBytesIfValid(
                                         const bsl::string_view& string,
                                         IntPtr                  numCodePoints)
{
    // Deprecated spelling: `numBytesRaw` does the actual work.
    const IntPtr numBytes = numBytesRaw(string, numCodePoints);
    return numBytes;
}

inline
Utf8Util::IntPtr Utf8Util::numCharacters(const char *string)
{
    // Deprecated spelling: `numCodePointsRaw` does the actual work.
    const IntPtr numCodePoints = numCodePointsRaw(string);
    return numCodePoints;
}

inline
Utf8Util::IntPtr Utf8Util::numCharacters(const char *string, size_type length)
{
    // Deprecated spelling: `numCodePointsRaw` does the actual work.
    const IntPtr numCodePoints = numCodePointsRaw(string, length);
    return numCodePoints;
}

inline
Utf8Util::IntPtr Utf8Util::numCharactersIfValid(const char **invalidString,
                                                const char  *string)
{
    // Deprecated spelling: `numCodePointsIfValid` does the actual work.
    const IntPtr rc = numCodePointsIfValid(invalidString, string);
    return rc;
}

inline
Utf8Util::IntPtr Utf8Util::numCharactersIfValid(const char **invalidString,
                                                const char  *string,
                                                size_type    length)
{
    // Deprecated spelling: `numCodePointsIfValid` does the actual work.
    const IntPtr rc = numCodePointsIfValid(invalidString, string, length);
    return rc;
}

inline
Utf8Util::IntPtr Utf8Util::numCharactersRaw(const char *string)
{
    // Deprecated spelling: `numCodePointsRaw` does the actual work.
    const IntPtr numCodePoints = numCodePointsRaw(string);
    return numCodePoints;
}

inline
Utf8Util::IntPtr Utf8Util::numCharactersRaw(const char *string,
                                            size_type   length)
{
    // Deprecated spelling: `numCodePointsRaw` does the actual work.
    const IntPtr numCodePoints = numCodePointsRaw(string, length);
    return numCodePoints;
}

inline
Utf8Util::IntPtr Utf8Util::numCodePointsRaw(const bsl::string_view& string)
{
    // Unpack the view and defer to the (pointer, length) overload.
    const char *const begin    = string.data();
    const size_type   numBytes = string.length();

    return numCodePointsRaw(begin, numBytes);
}

}  // close package namespace
}  // close enterprise namespace

#endif

// ----------------------------------------------------------------------------
// Copyright 2015 Bloomberg Finance L.P.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------- END-OF-FILE ----------------------------------
